## different dataframes - Tara
hotels <- read_csv("hotel_booking.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## hotel = col_character(),
## arrival_date_month = col_character(),
## meal = col_character(),
## country = col_character(),
## market_segment = col_character(),
## distribution_channel = col_character(),
## reserved_room_type = col_character(),
## assigned_room_type = col_character(),
## deposit_type = col_character(),
## agent = col_character(),
## company = col_character(),
## customer_type = col_character(),
## reservation_status = col_character(),
## reservation_status_date = col_date(format = "")
## )
## ℹ Use `spec()` for the full column specifications.
hotels_refined = hotels %>% select(hotel, arrival_date_month, arrival_date_day_of_month, arrival_date_year, country ,adr, is_canceled) %>% rename(Hotel_Type = hotel, Country_of_Origin = country, Arrival_month = arrival_date_month, Arrival_date = arrival_date_day_of_month, Arrival_year = arrival_date_year, Average_Daily_Rate = adr)
month_levels <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
repeat_levels <- c(0, 1)
hotels_refined$Arrival_month = factor(hotels_refined$Arrival_month, levels = month_levels)
city_hotels = hotels_refined %>% filter(Hotel_Type == "City Hotel", is_canceled == 0) %>% filter(Country_of_Origin != "NULL")
resort_hotels = hotels_refined %>% filter(Hotel_Type == "Resort Hotel", is_canceled == 0) %>% filter(Country_of_Origin != "NULL")
city_hotels_freq = city_hotels%>% filter(Average_Daily_Rate != 0.00)%>% group_by(Country_of_Origin, Arrival_month) %>% summarise(n = n()) %>% mutate(frequency = n/sum(n))
## `summarise()` has grouped output by 'Country_of_Origin'. You can override using the `.groups` argument.
#different dataframes - David
df1 = df %>%
arrange(arrival_date_month) %>%
select(lead_time, arrival_date_month) %>%
group_by(arrival_date_month) %>%
summarize(avg_lead_time=mean(lead_time)) %>%
ungroup()
hotels_refined = hotels %>% select(hotel, arrival_date_month, arrival_date_day_of_month, arrival_date_year, country ,adr, is_canceled) %>% rename(Hotel_Type = hotel, Country_of_Origin = country, Arrival_month = arrival_date_month, Arrival_date = arrival_date_day_of_month, Arrival_year = arrival_date_year, Average_Daily_Rate = adr)
month_levels <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
repeat_levels <- c(0, 1)
hotels_refined$Arrival_month = factor(hotels_refined$Arrival_month, levels = month_levels)
city_hotels = hotels_refined %>% filter(Hotel_Type == "City Hotel", is_canceled == 0) %>% filter(Country_of_Origin != "NULL")
resort_hotels = hotels_refined %>% filter(Hotel_Type == "Resort Hotel", is_canceled == 0) %>% filter(Country_of_Origin != "NULL")
city_hotels_freq = city_hotels%>% filter(Average_Daily_Rate != 0.00)%>% group_by(Country_of_Origin, Arrival_month) %>% summarise(n = n()) %>% mutate(frequency = n/sum(n))
## `summarise()` has grouped output by 'Country_of_Origin'. You can override using the `.groups` argument.
resort_hotels_freq = resort_hotels %>% filter(Average_Daily_Rate != 0.00)%>% group_by(Country_of_Origin, Arrival_month) %>% summarise(n = n()) %>% mutate(frequency = n/sum(n))
## `summarise()` has grouped output by 'Country_of_Origin'. You can override using the `.groups` argument.
city_hotels_adr = city_hotels %>% group_by(Country_of_Origin, Arrival_month) %>% arrange(Country_of_Origin, Arrival_month) %>% summarize_at(vars(Average_Daily_Rate), list( ~mean(., na.rm = TRUE) )) %>% rename(Average_Monthly_Rate = Average_Daily_Rate)
resort_hotels_adr = resort_hotels %>% group_by(Country_of_Origin, Arrival_month) %>% arrange(Country_of_Origin, Arrival_month) %>% summarize_at(vars(Average_Daily_Rate), list( ~mean(., na.rm = TRUE) )) %>% rename(Average_Monthly_Rate = Average_Daily_Rate)
city_hotels_top = city_hotels %>% group_by(Country_of_Origin) %>% summarise(Num_Customers = n()) %>% arrange(desc(Num_Customers)) %>% head(10)
city_hotels_join = inner_join(city_hotels_adr, city_hotels_top, by = "Country_of_Origin")
city_hotels_join
## # A tibble: 120 x 4
## # Groups: Country_of_Origin [10]
## Country_of_Origin Arrival_month Average_Monthly_Rate Num_Customers
## <chr> <fct> <dbl> <int>
## 1 BEL January 84.8 1479
## 2 BEL February 92.1 1479
## 3 BEL March 97.1 1479
## 4 BEL April 115. 1479
## 5 BEL May 132. 1479
## 6 BEL June 123. 1479
## 7 BEL July 124. 1479
## 8 BEL August 122. 1479
## 9 BEL September 128. 1479
## 10 BEL October 120. 1479
## # … with 110 more rows
resort_hotels_top = resort_hotels %>% group_by(Country_of_Origin) %>% summarise(Num_Customers = n()) %>% arrange(desc(Num_Customers)) %>% head(10)
resort_hotels_join = inner_join(resort_hotels_adr, resort_hotels_top, by = "Country_of_Origin")
resort_hotels_join
## # A tibble: 120 x 4
## # Groups: Country_of_Origin [10]
## Country_of_Origin Arrival_month Average_Monthly_Rate Num_Customers
## <chr> <fct> <dbl> <int>
## 1 BEL January 58.2 389
## 2 BEL February 54.8 389
## 3 BEL March 50.1 389
## 4 BEL April 67.6 389
## 5 BEL May 89.0 389
## 6 BEL June 98.7 389
## 7 BEL July 174. 389
## 8 BEL August 195. 389
## 9 BEL September 96.5 389
## 10 BEL October 68.2 389
## # … with 110 more rows
city_hotels_join %>% ggplot(aes(x = Arrival_month, y = Average_Monthly_Rate, color = Country_of_Origin)) + geom_point() + theme(axis.text.y = element_text(angle = 45)) + theme(axis.text.x = element_text(angle = 50, hjust = 1), panel.background = element_blank(), plot.title = element_text(hjust = 0.5)) + ggtitle("City Hotels")
resort_hotels_join %>% ggplot() + geom_point(mapping = aes(x = Arrival_month, y = Average_Monthly_Rate, color = Country_of_Origin)) + theme(axis.text.y = element_text(angle = 45)) + theme(axis.text.x = element_text(angle = 50, hjust = 1), panel.background = element_blank(), plot.title = element_text(hjust = 0.5)) + ggtitle("Resort Hotels")
city_hotels_freq = city_hotels_freq %>% filter(n > 80, frequency < 0.6)
city_hotels_freq %>% ggplot() + geom_point(mapping = aes(x = Arrival_month, y = frequency, color = Country_of_Origin)) + theme(axis.text.y = element_text(angle = 45)) + theme(axis.text.x = element_text(angle = 50, hjust = 1), panel.background = element_blank(), plot.title = element_text(hjust = 0.5)) + ggtitle("City Hotel")
resort_hotels_freq = resort_hotels_freq %>% filter(n > 80)
resort_hotels_freq %>% ggplot() + geom_point(mapping = aes(x = Arrival_month, y = frequency, color = Country_of_Origin)) + theme(axis.text.y = element_text(angle = 45)) + theme(axis.text.x = element_text(angle = 50, hjust = 1), panel.background = element_blank(), plot.title = element_text(hjust = 0.5))+ ggtitle("Resort Hotel")
city_hotels2 = hotels %>% filter(hotel == "City Hotel", is_canceled == 0) %>% select(arrival_date_year, arrival_date_month, arrival_date_day_of_month,stays_in_week_nights, stays_in_weekend_nights, adr) %>% mutate(length_of_stay = stays_in_week_nights + stays_in_weekend_nights) %>% arrange(arrival_date_year, arrival_date_month, arrival_date_day_of_month)
city_hotels2$arrival_date_month = as.integer(factor(city_hotels2$arrival_date_month, levels = month.name))
city_hotels2 = city_hotels2 %>% select(arrival_date_year, arrival_date_month, arrival_date_day_of_month, length_of_stay, adr) %>% unite("arrival_date", c("arrival_date_year", "arrival_date_month", "arrival_date_day_of_month"), sep = "/") %>% filter(adr != 0.00)
city_hotels2$arrival_date <- as.Date(city_hotels2$arrival_date)
city_hotels2 = city_hotels2 %>% arrange(arrival_date) %>% group_by(arrival_date) %>% summarise(avg_length_of_stay = mean(length_of_stay), avg_adr = mean(adr))
city_hotels2 %>% ggplot() + geom_point(mapping = aes(x = arrival_date, y = avg_adr, color = avg_length_of_stay)) + ggtitle("City Hotels")
city_hotels2
## # A tibble: 785 x 3
## arrival_date avg_length_of_stay avg_adr
## <date> <dbl> <dbl>
## 1 2015-07-01 2.09 96.6
## 2 2015-07-02 3 58.7
## 3 2015-07-03 2.5 74.5
## 4 2015-07-04 4 63.8
## 5 2015-07-06 1 66.1
## 6 2015-07-07 6 69.2
## 7 2015-07-08 2.27 64.8
## 8 2015-07-09 5 58.9
## 9 2015-07-10 4.5 93.3
## 10 2015-07-11 2.74 87.1
## # … with 775 more rows
resort_hotels2 = hotels %>% filter(hotel == "Resort Hotel", is_canceled == 0) %>% select(arrival_date_year, arrival_date_month, arrival_date_day_of_month,stays_in_week_nights, stays_in_weekend_nights, adr) %>% mutate(length_of_stay = stays_in_week_nights + stays_in_weekend_nights) %>% arrange(arrival_date_year, arrival_date_month, arrival_date_day_of_month)
resort_hotels2$arrival_date_month = as.integer(factor(resort_hotels2$arrival_date_month, levels = month.name))
resort_hotels2 = resort_hotels2 %>% select(arrival_date_year, arrival_date_month, arrival_date_day_of_month, length_of_stay, adr) %>% unite("arrival_date", c("arrival_date_year", "arrival_date_month", "arrival_date_day_of_month"), sep = "/") %>% filter(adr != 0.00)
resort_hotels2$arrival_date <- as.Date(resort_hotels2$arrival_date)
resort_hotels2 = resort_hotels2 %>% arrange(arrival_date) %>% group_by(arrival_date) %>% summarise(avg_length_of_stay = mean(length_of_stay), avg_adr = mean(adr))
resort_hotels2 %>% ggplot() + geom_point(mapping = aes(x = arrival_date, y = avg_adr, color = avg_length_of_stay)) + ggtitle("Resort Hotels")
resort_hotels2
## # A tibble: 793 x 3
## arrival_date avg_length_of_stay avg_adr
## <date> <dbl> <dbl>
## 1 2015-07-01 4.94 93.2
## 2 2015-07-02 5.77 100.
## 3 2015-07-03 5 109.
## 4 2015-07-04 5.77 97.6
## 5 2015-07-05 6.11 108.
## 6 2015-07-06 6.26 112.
## 7 2015-07-07 5.08 121.
## 8 2015-07-08 5.95 105.
## 9 2015-07-09 4.43 101.
## 10 2015-07-10 5.56 124.
## # … with 783 more rows
model_data_city = city_hotels2
start_date = as.Date("2015-07-01")
model_data_city$arrival_date <- as.numeric(difftime(model_data_city$arrival_date, start_date, unit = "days"))
NumDays.city <- model_data_city$arrival_date
xc <- cos(2*pi*NumDays.city/366)
xs <- sin(2*pi*NumDays.city/366)
fit.lm <- lm(model_data_city$avg_adr ~ xc + xs)
fit <- fitted(fit.lm)
summary(fit.lm)
##
## Call:
## lm(formula = model_data_city$avg_adr ~ xc + xs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.408 -9.783 -0.760 11.136 75.582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 103.2384 0.6420 160.813 <2e-16 ***
## xc 17.8297 0.8970 19.877 <2e-16 ***
## xs 0.7110 0.9177 0.775 0.439
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.92 on 782 degrees of freedom
## Multiple R-squared: 0.3373, Adjusted R-squared: 0.3356
## F-statistic: 199 on 2 and 782 DF, p-value: < 2.2e-16
pred <- predict(fit.lm, newdata = data.frame(NumDays.city = NumDays.city))
plot(model_data_city$avg_adr ~ NumDays.city, data= model_data_city, xlim=c(1, 900))
lines(fit, col="red")
lines(NumDays.city, pred, col="blue")
NumDays.city <- model_data_city$arrival_date
Avg_ADR.city <- model_data_city$avg_adr
fit.lm <- lm(Avg_ADR.city ~ poly(NumDays.city, 6, raw=TRUE))
fit <- fitted(fit.lm)
summary(fit.lm)
##
## Call:
## lm(formula = Avg_ADR.city ~ poly(NumDays.city, 6, raw = TRUE))
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.554 -7.989 -1.215 6.440 78.009
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.568e+01 3.466e+00 16.06 <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)1 1.642e+00 1.160e-01 14.15 <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)2 -2.197e-02 1.254e-03 -17.52 <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)3 1.172e-04 5.898e-06 19.87 <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)4 -2.860e-07 1.348e-08 -21.22 <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)5 3.225e-10 1.473e-11 21.89 <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)6 -1.363e-13 6.166e-15 -22.11 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.81 on 778 degrees of freedom
## Multiple R-squared: 0.6628, Adjusted R-squared: 0.6602
## F-statistic: 254.9 on 6 and 778 DF, p-value: < 2.2e-16
pred <- predict(fit.lm, newdata = data.frame(NumDays.city = NumDays.city))
plot(Avg_ADR.city ~ NumDays.city, data= model_data_city, xlim=c(1, 900))
lines(fit, col="red")
lines(NumDays.city, pred, col="blue")
df %>% select(adr, reservation_status, days_in_waiting_list, reserved_room_type, assigned_room_type, market_segment, country_name, stays_in_week_nights, stays_in_weekend_nights, arrival_date_year, lead_time, hotel) %>%
filter(days_in_waiting_list < 300, adr <200, days_in_waiting_list != 0) %>%
ggplot(aes(days_in_waiting_list, adr)) + geom_point(aes(color=reserved_room_type, alpha =0.4)) + geom_smooth(method="lm", se=T) + facet_wrap(~hotel, nrow=2) +labs(title = "ADR vs. Days on the Waitlist",
x = "Days on the Waitlist",
y = "ADR")+
theme(panel.background = element_blank(),
plot.title = element_text(hjust = 0.5)) + scale_alpha(guide = 'none')
## `geom_smooth()` using formula 'y ~ x'
df[df$deposit_type== "Non Refund", ] %>% group_by(customer_type, is_canceled ) %>% mutate(count = n()) %>%
ggplot(aes(x=reorder(customer_type, -count), y=count)) + geom_bar(stat = "identity", position = position_dodge()) + theme_bw() +
labs(title = "Cancelled Non-Refund bookings by Customer Type",
x = "Customer Type",
y = "Count") +
theme(panel.background = element_blank(),
plot.title = element_text(hjust = 0.5))
ggplot(df1) +
geom_line(aes(
x=arrival_date_month,
y=avg_lead_time,
group=1
)) +
ggtitle("Average Lead Time vs Arrival Month") +
xlab("Arrival Month") +
ylab("Average Lead Time (Days)") +
theme(axis.text.x=element_text(angle=30, vjust=1,),
plot.title=element_text(hjust=0.5))
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
leadgraph = df %>%
group_by(as.logical(is_canceled)) %>%
summarize(avg_lead_time = mean(lead_time)) %>%
ungroup() %>%
rename(value=avg_lead_time)
leadgraph$key=rep("Average Lead Time", 2)
prevgraph = df %>%
group_by(as.logical(is_canceled)) %>%
summarize(avg_prev_canc = mean(previous_cancellations)) %>%
ungroup() %>%
rename(value=avg_prev_canc)
prevgraph$key=rep("Average Previous Cancellations", 2)
prevuncancgraph = df %>%
group_by(as.logical(is_canceled)) %>%
summarize(avg_prev_uncanc = mean(previous_bookings_not_canceled)) %>%
ungroup() %>%
rename(value=avg_prev_uncanc)
prevuncancgraph$key=rep("Average Previous Bookings Not Canceled", 2)
adrgraph = df %>%
group_by(as.logical(is_canceled)) %>%
summarize(avg_adr = mean(adr)) %>%
ungroup() %>%
rename(value=avg_adr)
adrgraph$key=rep("Average ADR", 2)
fullgraph = bind_rows(leadgraph, prevgraph, prevuncancgraph, adrgraph)
ggplot(fullgraph, aes(x=`as.logical(is_canceled)`, y=value)) +
geom_col() +
facet_wrap(~key, scales = "free")
pvals = c()
pvals = c(pvals, t.test(df$lead_time~as.logical(df$is_canceled))$p.value[[1]])
pvals = c(pvals, t.test(df$previous_cancellations~as.logical(df$is_canceled))$p.value[[1]])
pvals = c(pvals, t.test(df$previous_bookings_not_canceled~as.logical(df$is_canceled))$p.value[[1]])
pvals = c(pvals, t.test(df$adr~as.logical(df$is_canceled))$p.value[[1]])
vars = c("Average Lead Time",
"Average Previous Cancellations",
"Average Previous Bookings Not Canceled",
"Average ADR")
pvaltable = data.frame(Variables = vars, "P values" = pvals)
pvaltable$P.values = as.character(pvaltable$P.values)
to_print = pvaltable %>%
xtable(align="ccc")
print(to_print,
"html",
html.table.attributes="align='center',
rules='rows',
width=50%,
frame='hsides',
border-spacing=5px"
)
## <!-- html table generated in R 4.1.0 by xtable 1.8-4 package -->
## <!-- Thu Jul 29 23:35:10 2021 -->
## <table align='center',
## rules='rows',
## width=50%,
## frame='hsides',
## border-spacing=5px>
## <tr> <th> </th> <th> Variables </th> <th> P.values </th> </tr>
## <tr> <td align="center"> 1 </td> <td align="center"> Average Lead Time </td> <td align="center"> 0 </td> </tr>
## <tr> <td align="center"> 2 </td> <td align="center"> Average Previous Cancellations </td> <td align="center"> 3.44419715019362e-196 </td> </tr>
## <tr> <td align="center"> 3 </td> <td align="center"> Average Previous Bookings Not Canceled </td> <td align="center"> 5.87710671289984e-129 </td> </tr>
## <tr> <td align="center"> 4 </td> <td align="center"> Average ADR </td> <td align="center"> 9.7601257426029e-59 </td> </tr>
## </table>
hotelgraph = df %>%
group_by(hotel) %>%
summarize(
n=n(),
num_canceled = sum(is_canceled),
prop_canceled = mean(is_canceled)
) %>%
ungroup() %>%
rename(value=hotel)
hotelgraph$key=rep("Hotel", 2)
repeatgraph = df %>%
group_by(as.logical(is_repeated_guest)) %>%
summarize(
n=n(),
num_canceled = sum(is_canceled),
prop_canceled = mean(is_canceled)
) %>%
ungroup() %>%
rename(value=`as.logical(is_repeated_guest)`)
repeatgraph$key=rep("Is Repeated Guest?", 2)
repeatgraph$value=as.character(repeatgraph$value)
fullgraph2 = bind_rows(hotelgraph, repeatgraph)
ggplot(fullgraph2, aes(x=value, y=prop_canceled)) +
geom_col() +
facet_wrap(~key, scales="free")
detach("package:MASS", unload = TRUE)
hotel_bookings <- read_csv("hotel_booking.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## hotel = col_character(),
## arrival_date_month = col_character(),
## meal = col_character(),
## country = col_character(),
## market_segment = col_character(),
## distribution_channel = col_character(),
## reserved_room_type = col_character(),
## assigned_room_type = col_character(),
## deposit_type = col_character(),
## agent = col_character(),
## company = col_character(),
## customer_type = col_character(),
## reservation_status = col_character(),
## reservation_status_date = col_date(format = "")
## )
## ℹ Use `spec()` for the full column specifications.
pop_Mean_canceled = mean(hotel_bookings$is_canceled)
pop_Mean_canceled #our population sample
## [1] 0.3704163
hotel_bookings$cumul_cancellations <- hotel_bookings$previous_cancellations+hotel_bookings$is_canceled
hotel_bookings$cumul_bookings <- hotel_bookings$previous_bookings_not_canceled+1-hotel_bookings$is_canceled
hotel_bookings$customer_cancellation_rate_total <- hotel_bookings$cumul_cancellations/(hotel_bookings$cumul_cancellations +hotel_bookings$cumul_bookings)
fcustomer_cancellation_rate_total <-tibble(
filter(hotel_bookings,customer_cancellation_rate_total>0&customer_cancellation_rate_total<1)
)
fcustomer_cancellation_rate_total$cancellation_rate <- fcustomer_cancellation_rate_total$cumul_cancellations/(fcustomer_cancellation_rate_total$cumul_bookings+fcustomer_cancellation_rate_total$cumul_cancellations)
ggplot(data=fcustomer_cancellation_rate_total)+geom_histogram(mapping=aes(cancellation_rate),fill="red",color="white")+geom_vline(xintercept=pop_Mean_canceled,color="red")+
labs(title = "Cancellation Rates by Customer",
x = "Cancellation Rate",
y = "Frequency") +
theme(panel.background = element_blank(),
plot.title = element_text(hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
agentDist <- filter(hotel_bookings,hotel_bookings$distribution_channel=="TA/TO")
agentDist$cancellation_rate <- agentDist$cumul_cancellations /(agentDist$cumul_bookings+agentDist$cumul_cancellations)
agent_sampl_Mean <- mean(agentDist$is_canceled)
agent_sampl_Mean #agents-only sample
## [1] 0.4102585
table <- agentDist %>%
group_by(agent) %>%
summarize(mean = mean(as.integer(is_canceled)), sum = sum(as.integer(is_canceled)))
tableB <- filter(table,mean>0&mean<1)
ggplot(data=tableB)+geom_histogram(mapping=aes(mean),color="white",fill="purple")+geom_vline(xintercept=agent_sampl_Mean,color="red")+geom_vline(xintercept=pop_Mean_canceled,color="purple")+
labs(title = "Cancellation Rate, per Agent",
x = "Cancellation Rate",
y = "Frequency") +
theme(panel.background = element_blank(),
plot.title = element_text(hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
nonAgentDist <- tibble(
filter(hotel_bookings,agent=="NULL"&customer_cancellation_rate_total>0&customer_cancellation_rate_total<1)
)
nonAgentMean <- mean(nonAgentDist$customer_cancellation_rate_total)
random_sample_rates <- tibble(tableB)
random_sample_rates$randPop <- sample(fcustomer_cancellation_rate_total$cancellation_rate,168)
random_sample_rates$randNon <- sample(nonAgentDist$customer_cancellation_rate_total,168)
random_sample_rates$randNon <- sample(nonAgentDist$customer_cancellation_rate_total,168)
ggplot(data=random_sample_rates)+geom_histogram(mapping=aes(tableB$mean),color="white",fill="red")+geom_histogram(mapping=aes(randPop),color="white",fill="purple")+geom_histogram(mapping=aes(randNon),color="white",fill="blue")+geom_vline(xintercept=agent_sampl_Mean,color="red")+geom_vline(xintercept=pop_Mean_canceled,color="purple")+geom_vline(xintercept=nonAgentMean,color="blue")+labs(title = "Cancellation Rate, per Agent (red), per non-Agent (blue), and population (purple)",
x = "Cancellation Rate",
y = "Frequency") + theme(panel.background = element_blank())
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Therefore, our data implies that agents (represented in red) consistently cancel more nonAgents (blue) and the general population of both (purple).
adr_pop_Mean = mean(hotel_bookings$adr)
# mean for the population is 101.8311
adr_revised<-hotel_bookings %>% filter(adr != 5400 & adr != 510)
ggplot(data=adr_revised)+geom_histogram(mapping=aes(adr),fill="blue",color="white")+geom_vline(xintercept = adr_pop_Mean,color="blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
child <- as.integer(hotel_bookings$children)
hotel_bookings$childCount = child
withoutChild<-tibble(filter(hotel_bookings,child=="0"&adr != 5400 & adr != 510))
adr_sampl_Mean = mean(withoutChild$adr)
# mean for the childless is 97.47
ggplot(data=withoutChild)+geom_histogram(mapping=aes(adr),color="white",fill="red")+geom_vline(xintercept = adr_sampl_Mean,color="red")+geom_vline(xintercept = adr_pop_Mean,color="blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Both histograms appear extremely similar, but as our means are significantly different given our dataset, we can conclude that, on average, groups that have children have a lower ADR than those without.
Our group decided to investigate Q1 and Q2 in further detail.
model_data_resort = resort_hotels2
model_data_resort$arrival_date <- as.numeric(difftime(model_data_resort$arrival_date, start_date, unit = "days"))
NumDays.resort <- model_data_resort$arrival_date
xc <- cos(2*pi*NumDays.resort/366)
xs <- sin(2*pi*NumDays.resort/366)
fit.lm <- lm(model_data_resort$avg_adr ~ xc + xs)
fit <- fitted(fit.lm)
summary(fit.lm)
##
## Call:
## lm(formula = model_data_resort$avg_adr ~ xc + xs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.645 -21.414 -1.789 14.524 155.845
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89.456 1.062 84.24 <2e-16 ***
## xc 49.277 1.478 33.34 <2e-16 ***
## xs 21.450 1.523 14.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.75 on 790 degrees of freedom
## Multiple R-squared: 0.6327, Adjusted R-squared: 0.6317
## F-statistic: 680.3 on 2 and 790 DF, p-value: < 2.2e-16
pred <- predict(fit.lm, newdata = data.frame(NumDays.resort = NumDays.resort))
plot(model_data_resort$avg_adr ~ NumDays.resort, data= model_data_resort, xlim=c(1, 900))
lines(fit, col="red")
lines(NumDays.resort, pred, col="blue")
Avg_ADR.resort <- model_data_resort$avg_adr
fit.lm <- lm(Avg_ADR.resort ~ poly(NumDays.resort, 6, raw=TRUE))
fit <- fitted(fit.lm)
summary(fit.lm)
##
## Call:
## lm(formula = Avg_ADR.resort ~ poly(NumDays.resort, 6, raw = TRUE))
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.36 -17.84 -1.92 12.20 137.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.232e+02 6.288e+00 19.590 < 2e-16 ***
## poly(NumDays.resort, 6, raw = TRUE)1 1.290e+00 2.207e-01 5.844 7.45e-09 ***
## poly(NumDays.resort, 6, raw = TRUE)2 -3.257e-02 2.434e-03 -13.379 < 2e-16 ***
## poly(NumDays.resort, 6, raw = TRUE)3 2.013e-04 1.156e-05 17.418 < 2e-16 ***
## poly(NumDays.resort, 6, raw = TRUE)4 -5.151e-07 2.656e-08 -19.390 < 2e-16 ***
## poly(NumDays.resort, 6, raw = TRUE)5 5.862e-10 2.913e-11 20.123 < 2e-16 ***
## poly(NumDays.resort, 6, raw = TRUE)6 -2.454e-13 1.222e-14 -20.082 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25.68 on 786 degrees of freedom
## Multiple R-squared: 0.7277, Adjusted R-squared: 0.7256
## F-statistic: 350.1 on 6 and 786 DF, p-value: < 2.2e-16
pred <- predict(fit.lm, newdata = data.frame(NumDays.resort = NumDays.resort))
plot(Avg_ADR.resort ~ NumDays.resort, data= model_data_resort, xlim=c(1, 900))
lines(fit, col="red")
lines(NumDays.resort, pred, col="blue")
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
df_log = df %>%
dplyr::select(hotel,
is_canceled,
lead_time,
previous_cancellations,
previous_bookings_not_canceled,
adr,
is_repeated_guest
) %>% mutate(id=row_number())
set.seed(216)
df_train=df_log %>%
sample_frac(0.80)
df_test=anti_join(df_log, df_train, by='id')
df_train= df_train %>%
subset(select = -id)
df_test= df_test %>%
subset(select= -id)
Model 1: Logistic
model1 = glm(
is_canceled~.,
family="binomial",
data=df_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
options("scipen"=100, "digits"=4)
summary(model1)
##
## Call:
## glm(formula = is_canceled ~ ., family = "binomial", data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.490 -0.893 -0.686 1.190 6.578
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -1.3828643 0.0210942 -65.6
## hotelResort Hotel -0.4658058 0.0158784 -29.3
## lead_time 0.0049329 0.0000715 69.0
## previous_cancellations 2.9149599 0.0577364 50.5
## previous_bookings_not_canceled -0.6372580 0.0288014 -22.1
## adr 0.0037409 0.0001530 24.4
## is_repeated_guest -1.0641241 0.0877746 -12.1
## Pr(>|z|)
## (Intercept) <0.0000000000000002 ***
## hotelResort Hotel <0.0000000000000002 ***
## lead_time <0.0000000000000002 ***
## previous_cancellations <0.0000000000000002 ***
## previous_bookings_not_canceled <0.0000000000000002 ***
## adr <0.0000000000000002 ***
## is_repeated_guest <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 125985 on 95511 degrees of freedom
## Residual deviance: 110568 on 95505 degrees of freedom
## AIC: 110582
##
## Number of Fisher Scoring iterations: 7
df_test = df_test %>% add_predictions(
model1,
var="predicted_canc1") %>%
mutate(predicted_canc1 = ifelse(predicted_canc1 > 0.5,1,0))
Model 2: Stepwise logistic
model2 = stepAIC(model1)
## Start: AIC=110582
## is_canceled ~ hotel + lead_time + previous_cancellations + previous_bookings_not_canceled +
## adr + is_repeated_guest
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 110568 110582
## - is_repeated_guest 1 110769 110781
## - adr 1 111163 111175
## - hotel 1 111490 111502
## - previous_bookings_not_canceled 1 112027 112039
## - lead_time 1 115811 115823
## - previous_cancellations 1 115816 115828
df_test = df_test %>% add_predictions(
model2,
var="predicted_canc2") %>%
mutate(predicted_canc2 = ifelse(predicted_canc2 > 0.5,1,0))
Model 3: Logistic with twofold interaction
model3 = glm(
is_canceled~.^2,
family="binomial",
data=df_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
tidy(model3)[,c("term", "estimate", "p.value")]
## # A tibble: 22 x 3
## term estimate p.value
## <chr> <dbl> <dbl>
## 1 (Intercept) -1.45 0
## 2 hotelResort Hotel -0.406 2.36e- 22
## 3 lead_time 0.00585 9.32e-208
## 4 previous_cancellations 2.90 1.29e- 25
## 5 previous_bookings_not_canceled -0.755 4.74e- 10
## 6 adr 0.00404 1.84e- 51
## 7 is_repeated_guest -0.851 3.62e- 8
## 8 hotelResort Hotel:lead_time -0.00119 5.70e- 14
## 9 hotelResort Hotel:previous_cancellations 0.461 1.36e- 2
## 10 hotelResort Hotel:previous_bookings_not_canceled -0.0386 6.32e- 1
## # … with 12 more rows
df_test = df_test %>% add_predictions(
model3,
var="predicted_canc3") %>%
mutate(predicted_canc3 = ifelse(predicted_canc3 > 0.5,1,0))
Model4: k-NN, k=5
#standardize dataset
standardize = function(vector) {
return(sd(vector)*vector +
mean(vector)
)
}
df_knn = df_test %>%
mutate(
previous_cancellations=standardize(previous_cancellations),
previous_bookings_not_canceled=standardize(previous_bookings_not_canceled),
hotel=ifelse(hotel=="Resort Hotel", 1, 0),
hotel=standardize(hotel),
lead_time=standardize(lead_time),
adr=standardize(adr)
) %>%
mutate(predicted_canc4=knn(
train=dplyr::select(
df_train,
lead_time,
adr,
previous_cancellations
),
test=dplyr::select(
df_test,
lead_time,
adr,
previous_cancellations
),
cl=factor(
df_train$is_canceled,
levels=c(0,1),
labels=c("0","1")
),
k=5)
) %>%
dplyr::select(predicted_canc4)
df_test = bind_cols(df_test, df_knn) %>%
mutate(predicted_canc4 = as.integer(predicted_canc4)-1)
Metrics
#input df_test$predicted_canc1, or 2, etc.
sensitivity = function(predicted_canc) {
return(
sum(df_test$is_canceled & predicted_canc) /
sum(df_test$is_canceled)
)
}
specificity = function(predicted_canc) {
return(
sum(!df_test$is_canceled & !predicted_canc) /
sum(!df_test$is_canceled)
)
}
accuracy = function(predicted_canc) {
(
sum(df_test$is_canceled & predicted_canc) +
sum(!df_test$is_canceled & !predicted_canc)
) / nrow(df_test)
}
metrics_table = tribble(
~Model, ~sensitivity, ~specificity, ~accuracy,
1, sensitivity(df_test$predicted_canc1), specificity(df_test$predicted_canc1), accuracy(df_test$predicted_canc1),
2, sensitivity(df_test$predicted_canc2), specificity(df_test$predicted_canc2), accuracy(df_test$predicted_canc2),
3, sensitivity(df_test$predicted_canc3), specificity(df_test$predicted_canc3), accuracy(df_test$predicted_canc3),
4, sensitivity(df_test$predicted_canc4), specificity(df_test$predicted_canc4), accuracy(df_test$predicted_canc4)
)
to_print1 = metrics_table %>%
xtable(align="ccccc")
print(to_print1,
"html",
html.table.attributes="align='center',
rules='rows',
width=50%,
frame='hsides',
border-spacing=5px"
)
| Model | sensitivity | specificity | accuracy | |
|---|---|---|---|---|
| 1 | 1.00 | 0.19 | 0.98 | 0.69 |
| 2 | 2.00 | 0.19 | 0.98 | 0.69 |
| 3 | 3.00 | 0.19 | 0.98 | 0.69 |
| 4 | 4.00 | 0.60 | 0.85 | 0.76 |
detach("package:MASS", unload = TRUE)
Our team looked at hotel booking data. Customers visit more often and rates are higher during the summer. However, customers from some countries, particularly those near Portugal, arrive more frequently during winter months. Longer stays are associated with a lower average daily rate, but the majority of stays range from 2-3 days for both hotels. More than the average length of stay, average daily rate seems to oscillate with time of month more often, and is especially present in the resort hotel. However, stays of 1 day are usually plotted higher on the y axis for both graphs compared to stays of 3 days or longer on the same x axis location (meaning on the same day). Our team next reviewed which percentage of the bookings as a whole are canceled: 37%. When we produce the same observation for an agents-only sample, we find that among agents, 41% of bookings are canceled. Comparing the data, we found that agents consistently cancel more non-agents and the general population of both. For the next question, we created a statistic for the population ADR mean, and then filtered the customers without children into a separate data frame to analyze if significant differences exist between the two groups. On average, we found that groups that have children have a lower ADR than those without. For our first follow up question, we wanted to see the difference between a sinusoidal and polynomial regression for the relationship between arrival date and average daily rate. We used data previously arranged to plot arrival time and average daily rate in a time series. In order to find this model, We needed to tweak the arrival date data. Instead of showing the actual arrival date, the x axis now indicated the number of days since the first recorded date of a customer. For both city and resort data, the sinusoidal curve seems to be a better fit. The R^2 values for the polynomial curve are better for both the resort and city data, as they are higher than the sinusoidal R^2 values. The polynomial used was a sextic function, as all graphs seemed to have 5 inflection points. The oscillations in the graph suggested that this data could be plotted using these functions. The R^2 value for resort data sinusoidal was 0.6317, and the R^2 value for resort data polynomial was 0.7256. The R^2 value for city data sinusoidal was 0.3356, and the R^2 value for city data polynomial was 0.6602, indicating that the sextic function was a better fit for both data. This indicates that while the data oscillates, there are certain arrival dates that strongly forecast lower average daily rates and certain arrival dates that strongly forecast higher average daily rates with a peak in the average daily rate ranging around July of each year. For our next question, we asked which model best predicts cancellation. We made 4 models: Model 1 is logistic regression, Model 2 stepwise logistic regression, Model 3 logistic regression with two-variable interaction, and Model 4 k-NN, with k=5. A table of with metrics measuring the success of these models is above. The k-NN model was the most accurate and the most sensitive, but it was the least specific, compared against the other logistic regression models. To determine whether sensitivity or specificity matters more, we need to consider the practical context of our model. We were thinking our model could be used to help hotel managers create a “fast-track” waitlist of people to replace the individuals who are more likely to cancel. As long as the hotels don’t actually guarantee rooms to folks on the fast-track waitlist, it should not matter whether managers wrongly predict cancellation. That is, it should not matter how “specific” managers are. In this context, we care more so about sensitivity. As such, the k-NN model would be the best fit. We finally explored whether there is a correlation or relationship between the days on the waitlist and the hotel’s Average Daily Revenue (ADR). We found a weak negative relationship between days on the waitlist and the ADR for city hotels and a weak positive relationship between the two variables for resort hotels. Next, we explored how often non-refund bookings canceled. The data showed that the “transient” customer type canceled an overwhelming number of bookings.
model_data_city = city_hotels2
start_date = as.Date("2015-07-01")
model_data_city$arrival_date <- as.numeric(difftime(model_data_city$arrival_date, start_date, unit = "days"))
xc <- cos(2*pi*model_data_city$arrival_date/365.25)
xs <- sin(2*pi*model_data_city$arrival_date/365.25)
fit.lm <- lm(avg_adr ~ xc + xs + arrival_date, data = model_data_city)
model_data_city$pred1 <- predict(fit.lm, model_data_city)
model.func = function(day){
return(
fit.lm$coefficients[[1]] + fit.lm$coefficients[[2]]*cos(2*pi*day/365.25) + fit.lm$coefficients[[3]]*sin(2*pi*day/365.25) + fit.lm$coefficients[[4]]*day
)
}
p1 <- ggplot() + geom_point(data = model_data_city, aes(x = arrival_date, y = avg_adr)) + geom_hline(aes(yintercept=0))
p1.trend = p1 +
geom_line(data = model_data_city, aes(x = arrival_date, y = pred1), color="red", size = 2)
new = data.frame(arrival_date=793:1577)
#new$xc <- cos(2*pi*new$arrival_date/366)
#new$xs <- sin(2*pi*new$arrival_date/366)
#new$pred1 <- model.func(new)
model_data_city = bind_rows(model_data_city, new)
model_data_city$pred <- model.func(model_data_city$arrival_date)
model_data_city
## # A tibble: 1,570 x 5
## arrival_date avg_length_of_stay avg_adr pred1 pred
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 2.09 96.6 95.8 95.8
## 2 1 3 58.7 95.9 95.9
## 3 2 2.5 74.5 96.1 96.1
## 4 3 4 63.8 96.2 96.2
## 5 5 1 66.1 96.5 96.5
## 6 6 6 69.2 96.6 96.6
## 7 7 2.27 64.8 96.7 96.7
## 8 8 5 58.9 96.8 96.8
## 9 9 4.5 93.3 96.9 96.9
## 10 10 2.74 87.1 97.1 97.1
## # … with 1,560 more rows
p1.trend + geom_line(data = model_data_city, aes(x = arrival_date, y = pred)) + geom_vline(xintercept = 792)
#model_data_city = model_data_city %>% select(arrival_date, avg_adr)
#predict(lm(avg_adr ~ xc + xs + model_data_city$arrival_date, data = model_data_city), newdata = pred)
#p1 +
#geom_line(color="blue", data=new) + geom_vline(xintercept = 792)
model_data_resort = resort_hotels2
model_data_resort$arrival_date <- as.numeric(difftime(model_data_resort$arrival_date, start_date, unit = "days"))
NumDays.resort <- model_data_resort$arrival_date
xc <- cos(2*pi*model_data_resort$arrival_date/365.25)
xs <- sin(2*pi*model_data_resort$arrival_date/365.25)
fit.lm <- lm(avg_adr ~ xc + xs + arrival_date, data = model_data_resort)
model_data_resort$pred1 <- predict(fit.lm, model_data_resort)
p2 <- ggplot() + geom_point(data =model_data_resort, aes(x = arrival_date, y = avg_adr)) + geom_hline(aes(yintercept=0))
p2.trend = p2 +
geom_line(data = model_data_resort, aes(x = arrival_date, y = pred1), color="red", size = 2)
new = data.frame(arrival_date=793:1577)
model_data_resort = bind_rows(model_data_resort, new)
model_data_resort$pred <- model.func(model_data_resort$arrival_date)
model_data_resort
## # A tibble: 1,578 x 5
## arrival_date avg_length_of_stay avg_adr pred1 pred
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 4.94 93.2 117. 117.
## 2 1 5.77 100. 117. 117.
## 3 2 5 109. 118. 118.
## 4 3 5.77 97.6 118. 118.
## 5 4 6.11 108. 118. 118.
## 6 5 6.26 112. 119. 119.
## 7 6 5.08 121. 119. 119.
## 8 7 5.95 105. 120. 120.
## 9 8 4.43 101. 120. 120.
## 10 9 5.56 124. 121. 121.
## # … with 1,568 more rows
p2.trend + geom_line(data = model_data_resort, aes(x = arrival_date, y = pred)) + geom_vline(xintercept = 792)